Billionaires broken down

Silvie Cinková

2025-08-09

library(tidyverse)
# Read the combined billionaires table (tab-separated) from the relative
# data path.
billionaires_df <- read_tsv("../DATA.NPFL112/billionaires_combined.tsv")

# Restrict to the 2020 rows and keep only the three columns used further on:
# daily income, person ID and world region.
billionaires_2020 <- billionaires_df %>%
  filter(time == 2020) %>%
  select(daily_income, person, world_6region)

Billionaires in 2020

# Compact, column-wise preview of the 2020 subset.
glimpse(x = billionaires_2020)
Rows: 2,462
Columns: 3
$ daily_income  <dbl> 186317, 408707, 164430, 286074, 124167, 314564, 2575542,…
$ person        <chr> "abdulla_al_futtaim", "abdulla_bin_ahmad_al_ghurair", "a…
$ world_6region <chr> "middle_east_north_africa", "middle_east_north_africa", …

The task performed by the code

  • Filter billionaires in 2020
  • Boxplots for world regions
  • Label outliers with person IDs

Solution

  • Identify outliers
  • Create a data frame of outliers
  • Plot the boxplot of the entire region
  • Over that, text plot with names of outliers
  • Bypass overlap by using ggrepel library
  • Plot the upper outlier threshold with a label

Identify outliers

  • group by region
  • upper threshold: Q3 + 1.5 IQR
  • lower threshold: Q1 - 1.5 IQR
  • is the value an outlier?

Code to identify outliers

# Flag upper outliers within each world region using the boxplot rule:
# an income is an outlier when it exceeds Q3 + 1.5 * IQR of its region.
# The result is ungrouped and sorted by income (highest first), then region.
billionaires_outliers <- billionaires_2020 %>%
  group_by(world_6region) %>%
  mutate(
    Q3 = quantile(daily_income, 0.75),
    IQR_col = IQR(daily_income),
    outliers_above = Q3 + 1.5 * IQR_col,
    # The comparison is already logical, so wrapping it in
    # if_else(..., TRUE, FALSE) adds nothing.
    is_outlier = daily_income > outliers_above
  ) %>%
  ungroup() %>%
  arrange(desc(daily_income), world_6region)

Vectors of world regions and of upper thresholds

# Region names in alphabetical order.
world_6region_vec <- billionaires_outliers %>%
  distinct(world_6region) %>%
  arrange(world_6region) %>%
  pull(world_6region)

# Upper outlier thresholds, one per region, in the same alphabetical region
# order so that position i matches world_6region_vec[i].
outliers_above <- billionaires_outliers %>%
  distinct(world_6region, outliers_above) %>%
  arrange(world_6region) %>%
  pull(outliers_above)

The (alphabetically) first world region

# Store the first element of the sorted region vector and show it.
world_region01 <- world_6region_vec[1]
print(world_region01)
[1] "america"

All Americans among billionaires

# All rows belonging to the first region in world_6region_vec.
all_oneregion <- filter(
  billionaires_outliers,
  world_6region == world_6region_vec[1]
)

All outliers among American billionaires

# Keep only the rows of the selected region that were flagged as outliers.
# (The original assigned the same name twice in one chained statement and
# compared the logical column to TRUE; both were redundant.)
outliers_oneregion <- all_oneregion %>%
  filter(is_outlier)

Boxplot of all Americans

  # Single purple boxplot of daily income for the selected region; x is a
  # constant because there is only one box to draw.
  all_oneregion %>%
    ggplot() +
    geom_boxplot(aes(x = 1, y = daily_income), color = "purple")

Boxplot of the Middle East and North Africa region

# Store the fourth element of the sorted region vector and show it.
world_region04 <- world_6region_vec[4]
print(world_region04)
[1] "middle_east_north_africa"
# Purple boxplot of daily income for the fourth region in world_6region_vec;
# x is a constant because there is only one box to draw.
ggplot(
  data = filter(billionaires_outliers, world_6region == world_6region_vec[4])
) +
  geom_boxplot(aes(x = 1, y = daily_income), color = "purple")

Plot Americans with outlier labels

library(ggrepel)
# Fix the random seed before building the labelled plot;
# change and re-plot until you are happy
set.seed(2331)

# Layer 1: boxplot of every billionaire in the region, with enlarged outlier
#          points.
# Layer 2: repelled person labels for the outliers only, with text size
#          mapped to daily income.
p <- ggplot() +
  geom_boxplot(
    data = all_oneregion,
    mapping = aes(x = 1, y = daily_income),
    color = "purple",
    outlier.size = 3
  ) +
  geom_text_repel(
    data = outliers_oneregion,
    mapping = aes(
      x = 1,
      y = daily_income,
      label = person,
      size = daily_income
    ),
    force = 7,
    max.overlaps = 123,
    alpha = 0.7,
    segment.size = 0.2,
    segment.alpha = 0.3
  )

Plot Americans with outlier labels

# Render the labelled boxplot.
print(p)

Prettify the plot

# Y-axis breaks: start at 1 million and step by 10 million up to the region's
# maximum income rounded to the nearest million. Computed once and reused for
# both `breaks` and `labels` (the original repeated the same seq() call).
y_breaks <- seq(
  from = 10^6,
  to = ceiling(round(max(all_oneregion$daily_income) * 10^(-6)) * 10^6),
  by = 10^7
)

# Legend breaks for the label size: span the outliers' income range in steps
# of 40 % of the largest outlier income.
size_breaks <- seq(
  from = round(min(outliers_oneregion$daily_income)),
  to = round(max(outliers_oneregion$daily_income)),
  by = round(max(outliers_oneregion$daily_income) * 0.4)
)

# NOTE(review): as.character() on values such as 1e6 yields scientific
# notation ("1e+06"); switch to format(..., scientific = FALSE) if plain
# digits are wanted on the axis.
prettified_01 <- p +
  scale_x_continuous(breaks = NULL, name = world_6region_vec[1]) +
  scale_size_continuous(range = c(3, 5), breaks = size_breaks) +
  scale_y_continuous(breaks = y_breaks, labels = as.character(y_breaks))
prettified_01

# Pass the plot explicitly instead of relying on ggsave()'s last_plot()
# default, so the saved file does not depend on what was displayed last.
ggsave(
  "images_ATRIUM/prettified_billionaires01.pdf",
  plot = prettified_01,
  scale = 2,
  limitsize = FALSE
)

The prettified plot

# Renders poorly in slides.
print(prettified_01)

Add a line where outliers start

# Add a faint dotted green horizontal line at the first region's upper
# outlier threshold, then render the result.
prettified_02 <- prettified_01 +
  geom_hline(
    yintercept = outliers_above[1],
    linetype = 3,
    linewidth = 1,
    color = "seagreen",
    alpha = 0.4
  )
print(prettified_02)

Prettified plot (from file)

Label the value at which observations become outliers

# Vertical distance (in y-axis units) between the threshold line and its
# text label; manual adjustment.
y_axis_offset_for_outlier_label <- 2700000

# Print the threshold value as green text slightly above the line, to the
# left of the box.
prettified_02 +
  annotate(
    geom = "text",
    x = 0.6,
    y = outliers_above[1] + y_axis_offset_for_outlier_label,
    label = outliers_above[1],
    size = 6,
    color = "seagreen",
    alpha = 0.7
  )

Make the position of the label adjustable to different plot scales

# Label offset = 5 % of the region's income range, so the gap between the
# threshold line and its label scales with the plot's y-axis.
y_axis_offset_for_outlier_label <-
  diff(range(all_oneregion$daily_income)) * 0.05

The if else condition

One of the regions had no outliers. The condition says that, when a region has no outliers, the names of all billionaires in that region are plotted instead.

Whole code

library(ggrepel)
# Flag upper outliers within each world region using the boxplot rule:
# an income is an outlier when it exceeds Q3 + 1.5 * IQR of its region.
billionaires_outliers <- billionaires_2020 %>%
  group_by(world_6region) %>%
  mutate(
    Q3 = quantile(daily_income, 0.75),
    IQR_col = IQR(daily_income),
    outliers_above = Q3 + 1.5 * IQR_col,
    # The comparison is already logical, so wrapping it in
    # if_else(..., TRUE, FALSE) adds nothing.
    is_outlier = daily_income > outliers_above
  ) %>%
  # Drop the grouping once the per-region statistics are computed, so that
  # later steps do not operate on a grouped tibble by accident.
  ungroup() %>%
  arrange(desc(daily_income), world_6region)


# Region names in alphabetical order.
world_6region_vec <- billionaires_outliers %>%
  distinct(world_6region) %>%
  arrange(world_6region) %>%
  pull(world_6region)

# Upper outlier thresholds, one per region, in the same alphabetical region
# order so that position i matches world_6region_vec[i].
outliers_above <- billionaires_outliers %>%
  distinct(world_6region, outliers_above) %>%
  arrange(world_6region) %>%
  pull(outliers_above)

# For every world region: draw a boxplot of daily income, label the outliers
# (or, when the region has none, every billionaire) with repelled person IDs,
# and save the figure as a PDF named after the region.
for (i in seq_along(world_6region_vec)) {
  region_name <- world_6region_vec[i]
  pdf_path <- paste0(
    "../my_output_files/outliers_billionaires_", region_name, ".pdf"
  )

  all_oneregion <- billionaires_outliers %>%
    filter(world_6region == region_name)
  outliers_oneregion <- all_oneregion %>%
    filter(is_outlier)

  # Label offset = 5 % of the region's income range, so the threshold label
  # sits at a comparable distance from its line on every plot scale.
  y_axis_offset_for_outlier_label <-
    diff(range(all_oneregion$daily_income)) * 0.05

  # Y-axis breaks: from 1 million in steps of 10 million up to the region's
  # maximum income rounded to the nearest million. The upper bound is clamped
  # to 1 million because seq() errors ("wrong sign in 'by' argument") when
  # `to` is below `from`, which would happen for a region whose maximum
  # rounds to 0 millions.
  y_break_max <- max(
    10^6,
    ceiling(round(max(all_oneregion$daily_income) * 10^(-6)) * 10^6)
  )
  y_breaks <- seq(from = 10^6, to = y_break_max, by = 10^7)
  # NOTE(review): as.character() on values such as 1e6 yields scientific
  # notation ("1e+06"); switch to format(..., scientific = FALSE) if plain
  # digits are wanted on the axis.
  y_labels <- as.character(y_breaks)

  # Seed once per region, before either branch builds its plot, so that the
  # no-outlier branch is seeded as well (originally only the outlier branch
  # was).
  set.seed(155)

  if (nrow(outliers_oneregion) == 0) {
    cat(region_name, "has no outliers. I will plot all names.\n")
    p <- ggplot() +
      geom_boxplot(
        data = all_oneregion,
        mapping = aes(y = daily_income, x = 1),
        color = "purple"
      ) +
      # No outliers to single out: label every person in the region.
      geom_text_repel(
        data = all_oneregion,
        mapping = aes(
          y = daily_income,
          x = 1,
          label = person,
          size = daily_income
        ),
        max.overlaps = 100,
        force = 7,
        alpha = 0.7,
        segment.alpha = 0.2,
        segment.size = 0.1
      ) +
      scale_x_continuous(breaks = NULL, name = region_name) +
      scale_y_continuous(breaks = y_breaks, labels = y_labels)
    ggsave(plot = p, filename = pdf_path, width = 7 * 2)
  } else {
    p <- ggplot() +
      geom_boxplot(
        data = all_oneregion,
        mapping = aes(y = daily_income, x = 1),
        color = "purple",
        outlier.size = 3
      ) +
      # Label only the outliers; text size is mapped to daily income.
      geom_text_repel(
        data = outliers_oneregion,
        mapping = aes(
          y = daily_income,
          x = 1,
          label = person,
          size = daily_income
        ),
        max.overlaps = 100,
        force = 7,
        alpha = 0.7,
        segment.alpha = 0.3,
        segment.size = 0.2
      ) +
      scale_x_continuous(breaks = NULL, name = region_name) +
      scale_size_continuous(
        range = c(6, 10),
        breaks = seq(
          from = round(min(outliers_oneregion$daily_income)),
          to = round(max(outliers_oneregion$daily_income)),
          by = round(max(outliers_oneregion$daily_income) * 0.4)
        )
      ) +
      scale_y_continuous(breaks = y_breaks, labels = y_labels) +
      # Dotted line at the region's upper outlier threshold ...
      geom_hline(
        yintercept = outliers_above[i],
        color = "seagreen",
        linewidth = 1,
        linetype = 3,
        alpha = 0.4
      ) +
      # ... and the threshold value printed just above it.
      annotate(
        geom = "text",
        x = 0.6,
        y = outliers_above[i] + y_axis_offset_for_outlier_label,
        label = outliers_above[i],
        color = "seagreen",
        size = 6
      ) +
      theme(
        axis.text = element_text(size = 12),
        axis.title = element_text(size = 14)
      )
    cat(region_name, "\n")
    print(p)
    ggsave(plot = p, filename = pdf_path, width = 7 * 2.2)
  }
}